Load libraries
suppressMessages(library(tidyverse))
suppressMessages(library(downloader))
suppressMessages(library(plotly))
suppressMessages(library(car))
suppressMessages(library(dunn.test))
suppressMessages(library(broom))
gapminder_clean.csv data as a tibble using read_csvdat <- read.csv('gapminder_clean.csv')
dim(dat)
## [1] 2607 20
The gapminder data consists of a matrix with 2607 rows and 20 columns.
1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered datadat %>%
filter(Year == 1962) %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
ggplot(aes(x = gdpPercap, y = CO2.emissions..metric.tons.per.capita.)) +
geom_point() +
ylab("CO2 emissions (metric tons per capita)")
# Because of one very large outlier the analysis repreated with log transformed data
dat %>%
filter(Year == 1962) %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.))) +
geom_point() +
ylab("log(CO2 emissions (metric tons per capita))")
Log transformed data can be used to make data as “normal” as possible so that the statistical analysis results become more valid.
‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the Pearson R value and associated p value?dat %>%
filter(Year == 1962) %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
group_by(x_var, y_var) %>%
summarise(cor_coef = cor.test(x_val, y_val)$estimate,
p_val = cor.test(x_val, y_val)$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups: x_var [1]
## x_var y_var cor_coef p_val
## <chr> <chr> <dbl> <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita. 0.926 1.13e-46
The Pearson R value of ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.9260817 and the associated p value is 1.128679e-46.
# Because of one very large outlier the analysis repreated with log transformed data
dat %>%
filter(Year == 1962) %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
group_by(x_var, y_var) %>%
# calculate pearson R value and associated p value
summarise(cor_coef = cor.test(log(x_val), log(y_val))$estimate,
p_val = cor.test(log(x_val), log(y_val))$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups: x_var [1]
## x_var y_var cor_coef p_val
## <chr> <chr> <dbl> <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita. 0.860 8.90e-33
The Pearson R value of log transformed ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.8602081 and the associated p value is 8.903567e-33.
‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…year.cor.max <- dat %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
# split origial tibble in lists depending on the year
split(.$Year) %>%
# perform pearson correlation test on each year list, keep only the pearson R value
map(~cor.test(log(.$CO2.emissions..metric.tons.per.capita.), log(.$gdpPercap))$estimate) %>%
# merge list of lists to one final list
unlist
year.cor.max
## 1962.cor 1967.cor 1972.cor 1977.cor 1982.cor 1987.cor 1992.cor 1997.cor
## 0.8602081 0.8736773 0.8843802 0.9014278 0.9151610 0.9080487 0.8934602 0.9208183
## 2002.cor 2007.cor
## 0.9300017 0.9275425
year.cor.max[which.max(year.cor.max)]
## 2002.cor
## 0.9300017
In year 2002 is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest.
plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.viz <- dat %>%
filter(Year == 2002) %>%
# remove rows which contain NAs
filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.), text=paste0('</br>log(gdpPercap): ', gdpPercap, '</br>log(CO2 emissions (metric tons per capita)): ', CO2.emissions..metric.tons.per.capita., '</br>population: ', pop, '</br>continent: ', continent, '</br>Country Name: ', Country.Name))) +
geom_point(aes(size = pop, color = continent)) +
ylab("log(CO2 emissions (metric tons per capita))")
ggplotly(viz, tooltip = c("text"))
continent and ‘Energy use (kg of oil equivalent per capita)’?First we must choose a suitable statistical test. We have continuous data and want to check for differences in mean for more than two groups.
Therefore, we have to check if the parametric assumptions are satisfied. For ANOVA, we have four parametric assumptions that must be met:
dat %>%
filter(continent != "") %>%
# remove rows which contain NAs
filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
ggplot(aes(x=continent, y=Energy.use..kg.of.oil.equivalent.per.capita., fill = continent)) +
geom_boxplot() +
ylab("Energy use (kg of oil equivalent per capita)")
First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent.
dat %>%
filter(continent != "") %>%
# remove rows which contain NAs
filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
# perform Levene’s test
do(tidy(leveneTest(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## # A tibble: 1 x 4
## statistic p.value df df.residual
## <dbl> <dbl> <int> <int>
## 1 12.4 8.00e-10 4 843
We reject the H0 hypothesis that the variances are equal based on the p value of our Levene’s test. Therefore, we can not use an ANOVA test and must use a Kruskal-Wallis test instead.
dat %>%
filter(continent != "") %>%
# remove rows which contain NAs
filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
# perform Kruskal-Wallis test
do(tidy(kruskal.test(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## # A tibble: 1 x 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 319. 1.01e-67 4 Kruskal-Wallis rank sum test
The resulting p value of our Kruskal-Wallis test shows that there is a significant difference in the mean for our data. Because of our significant result, we additionally perform a dunn’s test.
dat6 <- dat %>%
filter(continent != "") %>%
# remove rows which contain NAs
filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.)))
# perform dunn's test
dunn.test(dat6$Energy.use..kg.of.oil.equivalent.per.capita., g=dat6$continent)
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 318.6763, df = 4, p-value = 0
##
##
## Comparison of x by group
## (No adjustment)
## Col Mean-|
## Row Mean | Africa Americas Asia Europe
## ---------+--------------------------------------------
## Americas | -6.174961
## | 0.0000*
## |
## Asia | -4.852592 1.278883
## | 0.0000* 0.1005
## |
## Europe | -16.43361 -9.630909 -10.95868
## | 0.0000* 0.0000* 0.0000*
## |
## Oceania | -8.148201 -5.456302 -6.014711 -1.543152
## | 0.0000* 0.0000* 0.0000* 0.0614
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
‘Imports of goods and services (% of GDP)’ in the years after 1990?Again, we must choose a suitable test first. We have continuous data and want to check for differences in mean for two groups.
Therefore, we have to check if the parametric assumptions are satisfied. For Student’s unpaired t-test, we have four parametric assumptions that must be met:
dat %>%
filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
# remove rows which contain NAs
filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
ggplot(aes(x=continent, y=Imports.of.goods.and.services....of.GDP., fill = continent)) +
geom_boxplot() +
ylab("Imports of goods and services (% of GDP)")
First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent. We can also say that the dependent variable is measured on the incremental level year and that independent variables consist of matched pairs.
dat %>%
filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
# remove rows which contain NAs
filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
# perform Shapiro-Wilk normality test
do(tidy(shapiro.test(.$Imports.of.goods.and.services....of.GDP.)))
## # A tibble: 1 x 3
## statistic p.value method
## <dbl> <dbl> <chr>
## 1 0.849 1.46e-13 Shapiro-Wilk normality test
We reject the H0 hypothesis that the data is normally distributed based on the p values of our Shapiro-Wilk normality test. Therefore, we can not use a Student’s unpaired t-test and must use a Wilcoxon Rank sums test instead.
dat %>%
filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
# remove rows which contain NAs
filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
# perform Wilcoxon Rank sums test
do(tidy(wilcox.test(.$Imports.of.goods.and.services....of.GDP.~.$continent)))
## # A tibble: 1 x 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 5707 0.787 Wilcoxon rank sum test with continuity correcti~ two.sided
We reject the H0 hypothesis that there is a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 based on the p values of our Wilcoxon Rank sums test.
‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)First, exploratory data analysis (EDA) was performed on the data.
viz <- dat %>%
# remove rows which contain NAs
filter(across(c(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), ~ !is.na(.))) %>%
ggplot(aes(x=Year, y=Population.density..people.per.sq..km.of.land.area., group = Country.Name)) +
geom_line() +
ylab("Population density (people per sq. km of land area)")
ggplotly(viz)
highest.number <- dat %>%
# remove rows which contain NAs
filter(across(c(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), ~ !is.na(.))) %>%
# split origial tibble in lists depending on the year
split(.$Year) %>%
# get maximum of variable in every year
map(~max(.$Population.density..people.per.sq..km.of.land.area.)) %>%
# merge list of lists into one list
unlist
# map highest scoring variable from every year to respective country
highest.ranked.countries <- data.frame(lapply(highest.number, function(x) dat$Country.Name[which(dat$Population.density..people.per.sq..km.of.land.area. == x)]), stringsAsFactors=FALSE)
# count occurences of highest scoring countries
apply(highest.ranked.countries, MARGIN=1, table)
## [,1]
## Macao SAR, China 5
## Monaco 5
The countries that have the highest ‘Population density (people per sq. km of land area)’ across all years are Macao SAR and Monaco.
‘Life expectancy at birth, total (years)’ since 1962?First, exploratory data analysis (EDA) was performed on the data.
viz <- dat %>%
# remove rows which contain NAs
filter(across(c(Year, Country.Name, Life.expectancy.at.birth..total..years.), ~ !is.na(.))) %>%
ggplot(aes(x=Year, y=Life.expectancy.at.birth..total..years., group = Country.Name)) +
geom_line() +
ylab("Life expectancy at birth, total (years)")
ggplotly(viz)
To get the difference for Life expectancy at birth, total (years) for every country between 1962 and 2007 we subtract the value from 2007 minus the value from 1962.
dat %>%
filter(Year == 1962 | Year == 2007) %>%
# remove rows which contain NAs
filter(across(c(Year, Country.Name, Life.expectancy.at.birth..total..years.), ~ !is.na(.))) %>%
group_by(Country.Name) %>%
# Calculate difference between 1962 and 2007
mutate(diff = Life.expectancy.at.birth..total..years. - lag(Life.expectancy.at.birth..total..years., default = 0)) %>%
select(Country.Name, diff) %>%
# keep only rows with calculated difference
filter(row_number() %% 2 == 0) %>%
# sort by difference
arrange(desc(diff))
## # A tibble: 236 x 2
## # Groups: Country.Name [236]
## Country.Name diff
## <chr> <dbl>
## 1 Maldives 36.9
## 2 Bhutan 33.2
## 3 Timor-Leste 31.1
## 4 Tunisia 30.9
## 5 Oman 30.8
## 6 Nepal 30.6
## 7 China 29.9
## 8 Yemen, Rep. 27.2
## 9 Saudi Arabia 26.7
## 10 Iran, Islamic Rep. 26.6
## # ... with 226 more rows
Country Maldives has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962.